Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)
library(truncnorm)
library(ggpattern)
library(svglite)Libraries
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)
library(truncnorm)
library(ggpattern)
library(svglite)Paths
metadata_path <-
"data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
cnv_chroms_path <-
"../Crypto_Desjardins_Ashton/results/02.Dataset/cnv/cnv_chromosomes.tsv"
chrom_metrics_out_path <-
"results/tables/chrom_metrics.tsv"
ploidy_path <- "results/tables/ploidy_from_plots.tsv"Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.
metadata <- read.delim(
metadata_path,
header=TRUE,
sep=",",
stringsAsFactor = TRUE)
metadata <- metadata %>%
select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
filter(!strain == "H99") %>%
group_by(dataset, lineage)%>%
mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
ungroup() %>%
group_by(lineage)%>%
mutate(samples_in_lineage = n_distinct(sample))%>%
ungroup()%>%
mutate(total_samples = n_distinct(sample))%>%
droplevels()ploidy <- read.delim(
ploidy_path,
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE)%>%
select(sample, lineage,
accession, chromosome, ploidy, type, gappy,
distribution_pattern, fraction_duplications,
steps, smile, uniform)%>%
distinct()
ploidy$ploidy <- factor(ploidy$ploidy, levels = c("haploid", "diploid", "triploid", "tetraploid"))
ploidy$type <- factor(ploidy$type, levels = c("full", "large", "partial"))cnv_chroms <- read.delim(
cnv_chroms_path,
header = TRUE,
sep = "\t")%>%
mutate(chromosome = str_pad(chromosome, 2, pad = "0"))
cnv_chroms$chromosome <- as.factor(cnv_chroms$chromosome)
levels(cnv_chroms$chromosome) <- paste("chr", levels(cnv_chroms$chromosome), sep="")Calculate
* percent_largest_region : Percentage of length of chromosome covered by the largest CNV region of the chormosome.
* diff_span_cov: Difference between span_percent and coverage_percent.
* n_regions_100kb: Number of regions per 100kb
cnv_chroms <- cnv_chroms %>%
mutate(percent_largest_region = (size_largest_region / length)*100,
diff_span_cov = abs(span_percent - coverage_percent),
n_regions_100kb = (n_regions / length) * 100000)Pivot table to get only one line per chromosome.
Join with depth per chromosome and with ploidy informations.
chrom_metrics <- cnv_chroms %>%
pivot_wider(names_from = cnv,
values_from = c("n_regions","total_size_regions","coverage_percent",
"span_percent","diff_span_cov","size_smallest_region","size_largest_region",
"percent_largest_region", "n_regions_100kb",
"std_regions_size","norm_depth_mean","norm_depth_median",
"smooth_depth_mean","smooth_depth_median")) %>%
left_join(ploidy, by = c("sample", "accession", "lineage", "chromosome"))%>%
left_join(metadata, by = c("sample", "lineage"))Simulation of haploid chromosomes with all proportions (in steps of 5) of percentage of chromosome covered by deletions, duplications and single-copy regions.
Select simulation of percentage of deletion = 5.
n <- 100000
sd_v <- 0.2
p <- data.frame(p_del = c(rep(seq(0,100, by = 5), each = 100)),
p_sc = c(rep(seq(0,100, by = 5), 100)))%>%
mutate(sum = p_del + p_sc)%>%
filter(sum <= 100)%>%
distinct()
sample_summary <- p %>%
mutate(p_dup = 100 - p_del - p_sc,
n_del = p_del * n / 100,
n_sc = p_sc * n / 100,
n_dup = p_dup * n / 100)%>%
rowwise()%>%
mutate(median_depth = median(c(rtruncnorm(n = n_del, a = 0, mean = 0, sd = sd_v),
rnorm(n_sc, mean = 1, sd = sd_v),
rnorm(n_dup, mean = 2, sd = sd_v))),
mean_depth = mean(c(rtruncnorm(n = n_del, a = 0, mean = 0, sd = sd_v),
rnorm(n_sc, mean = 1, sd = sd_v),
rnorm(n_dup, mean = 2, sd = sd_v))))%>%
select(p_del,p_sc,p_dup, median_depth, mean_depth)%>%
mutate(category = paste(p_del,p_sc,p_dup, sep = "_"),
dup_filter = ifelse(p_dup >= 50, "half_duplication", "small_duplication"),
del_filter = ifelse(p_del >= 50, "half_deletion", "small_deletion"))
trend <- filter(sample_summary, p_del == 5)ggplot() + theme_minimal()
p <- ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point()+
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "vertical")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")
p1 <- ggMarginal(
p,
type = "boxplot",
margins = "both",
size = 5,
groupColour = FALSE,
groupFill = FALSE
)
p1To create a version of this notebook with and without the filter This version is not filtered
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.1)+
geom_point(aes(color = ploidy, shape = type))+
scale_color_brewer(name = "Ploidy", palette = "Set1") +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications",
shape = "Type")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median))+
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = coverage_percent_deletion, shape = type))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = coverage_percent_deletion))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = coverage_percent_deletion))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median))+
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = n_regions_100kb_duplication,shape = type))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = n_regions_100kb_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = n_regions_100kb_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = span_percent_duplication,shape = type)) +
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = span_percent_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = span_percent_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = diff_span_cov_duplication,shape = type)) +
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = diff_span_cov_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = diff_span_cov_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")max_color_range <- paste("2.7 -", max(chrom_metrics$norm_depth_median_duplication, na.rm = TRUE))
colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 1.5", "1.6 - 2", "2.1 - 2.3", "2.4 - 2.6", max_color_range)
ggplot(filter(chrom_metrics, smooth_depth_median_duplication < 4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(aes(color = cut(norm_depth_median_duplication,
breaks = c(-Inf, 1.5, 2, 2.3, 2.6, Inf),
labels = names(colors)),
shape = type),
alpha = 1) +
scale_color_manual(values = colors,
name = "Median Depth of dupCNV\nWindows in Chromosome")+
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")Warning: Removed 8874 rows containing missing values or values outside the scale range
(`geom_point()`).
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = norm_depth_median_duplication))+
geom_abline(intercept = 0, slope = 1, color = "gray", linetype = "solid")+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
scale_y_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = norm_depth_median_duplication))+
geom_abline(intercept = 0, slope = 1, color = "gray", linetype = "solid")+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 1.5", "1.6 - 2", "2.1 - 2.3", "2.4 - 2.6", "2.7 - 4")
ggplot(filter(chrom_metrics, smooth_depth_median_duplication < 4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(aes(color = cut(smooth_depth_median_duplication,
breaks = c(-Inf, 1.5, 2, 2.3, 2.6, Inf),
labels = names(colors)),
shape = type),
alpha = 1) +
scale_color_manual(values = colors,
name = "Smooth Median Depth of dupCNV\nWindows in Chromosome")+
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")Warning: Removed 8874 rows containing missing values or values outside the scale range
(`geom_point()`).
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = smooth_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = smooth_depth_median_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = smooth_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
scale_y_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = smooth_depth_median_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 5", "6 - 10", "11 -15", "16 - 20", "21 - 100")
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
# geom_point(aes(color = percent_largest_region_single_copy))+
# scale_color_viridis_c(direction = -1)+
geom_point(aes(color = cut(percent_largest_region_single_copy,
breaks = c(0,5,10,15,20,100),
labels = names(colors)),
shape = type)) +
scale_color_manual(values = colors,
name = "Largest Single-Copy Region")+
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "vertical")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = percent_largest_region_single_copy))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = percent_largest_region_single_copy))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
geom_text_repel(aes(label = sample), size = 3, max.overlaps = 10) +
scale_x_continuous(limits = c(0,100), breaks = seq(0,100, by = 10))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")Names, lower left
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 10) +
scale_x_continuous(limits = c(0,50),breaks = seq(0,50, by = 5))+
scale_y_continuous(limits = c(1, 1.6))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")Names, middle right
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 10) +
scale_x_continuous(limits = c(50,100),breaks = seq(50,100, by = 5))+
scale_y_continuous(limits = c(1.4,2))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")Names, upper right
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 20) +
scale_x_continuous(limits = c(50,100),breaks = seq(50,100, by = 5))+
scale_y_continuous(limits = c(2,2.6))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")coverage_threshold <- 80duplications_full <- filter(chrom_metrics, coverage_percent_duplication >= coverage_threshold)Number of samples with duplications in each lineage
dup_lineage <- duplications_full %>%
group_by(lineage) %>%
summarise(aneuploid = n_distinct(sample), samples_in_lineage = first(samples_in_lineage))%>%
mutate(euploid = samples_in_lineage - aneuploid)%>%
pivot_longer(cols = c(aneuploid, euploid), names_to = "type", values_to = "n_samples")%>%
mutate(percent = round((n_samples / samples_in_lineage) * 100, 1))%>%
mutate(dup_samples = ifelse(type == "aneuploid", n_samples, ""))
dup_lineage$type <- factor(dup_lineage$type, levels = c("euploid","aneuploid"))
dup_lineage %>%
filter(type == "aneuploid")%>%
select(lineage, n_samples, percent, samples_in_lineage)| lineage | n_samples | percent | samples_in_lineage |
|---|---|---|---|
| VNBI | 2 | 1.6 | 122 |
| VNBII | 3 | 4.7 | 64 |
| VNI | 37 | 4.3 | 853 |
| VNII | 3 | 18.8 | 16 |
Number of samples with duplications in each lineage-source
metadata_source_lineage <- metadata %>%
group_by(lineage,source,samples_in_lineage) %>%
summarize(n_lin_source = n_distinct(sample))
dup_lineage_source <- duplications_full %>%
group_by(lineage,source) %>%
summarise(aneuploid = n_distinct(sample))%>%
right_join(metadata_source_lineage, by = c("lineage", "source"))%>%
replace_na(list(aneuploid = 0))%>%
arrange(lineage, source)%>%
mutate(euploid = n_lin_source - aneuploid)%>%
select(lineage, source, aneuploid, euploid, n_lin_source, samples_in_lineage)%>%
pivot_longer(cols = c(aneuploid, euploid), names_to = "type", values_to = "n_samples")%>%
mutate(percent_samples = round((n_samples / samples_in_lineage) * 100, 1),
percent_source = round((n_lin_source / samples_in_lineage)*100,1),
dup_samples = ifelse(type == "aneuploid" & source == "Clinical", n_samples, ""),
type = str_to_title(type),
category = paste(source, type, sep = " "))
dup_lineage_source$type <- factor(dup_lineage_source$type, levels = c("Euploid","Aneuploid"))
dup_lineage_source$source <- factor(dup_lineage_source$source, levels = c("Environmental","Clinical"))
dup_lineage_source$category <- factor(dup_lineage_source$category, levels =c("Environmental Euploid", "Environmental Aneuploid","Clinical Euploid","Clinical Aneuploid"))
source <- dup_lineage_source%>%
select(lineage, source, n_lin_source, percent_source)%>%
distinct
dup_lineage_source %>%
filter(type == "Aneuploid")%>%
select(lineage, source, n_samples, percent_samples, n_lin_source)| lineage | source | n_samples | percent_samples | n_lin_source |
|---|---|---|---|---|
| VNBI | Clinical | 2 | 1.6 | 51 |
| VNBI | Environmental | 0 | 0.0 | 71 |
| VNBII | Clinical | 3 | 4.7 | 61 |
| VNBII | Environmental | 0 | 0.0 | 3 |
| VNI | Clinical | 37 | 4.3 | 825 |
| VNI | Environmental | 0 | 0.0 | 28 |
| VNII | Clinical | 3 | 18.8 | 15 |
| VNII | Environmental | 0 | 0.0 | 1 |
Number of duplicated chromosomes per sample
dup_sample <- duplications_full %>%
group_by(dataset,lineage, sample, strain, source) %>%
summarise(n_chroms = n_distinct(chromosome),
chromosomes = paste(chromosome, collapse = ", ")) %>%
arrange(desc(n_chroms))
dup_sample| dataset | lineage | sample | strain | source | n_chroms | chromosomes |
|---|---|---|---|---|---|---|
| Desjardins | VNI | SRS405109 | Bt92 | Clinical | 2 | chr04, chr13 |
| Desjardins | VNII | SRS417641 | C12 | Clinical | 2 | chr12, chr14 |
| Ashton | VNI | ERS1142697 | 20427_2#6 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS1142778 | 20949_2#15 | Clinical | 1 | chr13 |
| Ashton | VNI | ERS1142807 | 20427_3#21 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS1142815 | 20427_3#26 | Clinical | 1 | chr09 |
| Ashton | VNI | ERS1142819 | 20427_3#30 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS1142869 | 20949_2#42 | Clinical | 1 | chr09 |
| Ashton | VNI | ERS1142878 | 20427_4#13 | Clinical | 1 | chr09 |
| Ashton | VNI | ERS2540945 | 04CN-65-072 | Clinical | 1 | chr13 |
| Ashton | VNI | ERS2540951 | 04CN-64-024 | Clinical | 1 | chr01 |
| Ashton | VNI | ERS2540980 | 04CN-64-011 | Clinical | 1 | chr13 |
| Ashton | VNI | ERS2540986 | 04CN-65-001 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS2541044 | 04CN-64-074 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS2541126 | BMD3144 | Clinical | 1 | chr13 |
| Ashton | VNI | ERS2541138 | 04CN-03-053 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS2541212 | 04CN-03-039 | Clinical | 1 | chr13 |
| Ashton | VNI | ERS2541247 | BMD2209 | Clinical | 1 | chr06 |
| Ashton | VNI | ERS2541251 | 04CN-03-081 | Clinical | 1 | chr06 |
| Ashton | VNI | ERS2541264 | BMD3117 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS2541274 | BMD761 | Clinical | 1 | chr13 |
| Ashton | VNI | ERS2541304 | 04CN-65-056 | Clinical | 1 | chr01 |
| Ashton | VNI | ERS2541312 | 04CN-32-011 | Clinical | 1 | chr06 |
| Ashton | VNI | ERS2541317 | 04CN-64-090 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS2541342 | UI_31647-2 | Clinical | 1 | chr12 |
| Ashton | VNI | ERS542334 | 14892_1#38 | Clinical | 1 | chr06 |
| Ashton | VNI | ERS542397 | 14936_1#6 | Clinical | 1 | chr13 |
| Ashton | VNI | ERS542496 | 14893_1#10 | Clinical | 1 | chr13 |
| Desjardins | VNBI | SRS885841 | NRHc5009.REL.INI | Clinical | 1 | chr14 |
| Desjardins | VNBI | SRS885893 | NRHc5045.ENR.CLIN.ISO | Clinical | 1 | chr13 |
| Desjardins | VNBII | SRS417606 | Bt109 | Clinical | 1 | chr09 |
| Desjardins | VNBII | SRS417652 | MW-RSA2967 | Clinical | 1 | chr12 |
| Desjardins | VNBII | SRS881180 | PMHc1029.ENR.STOR | Clinical | 1 | chr12 |
| Desjardins | VNI | SRS404475 | LP-RSA3042 | Clinical | 1 | chr14 |
| Desjardins | VNI | SRS404481 | Bt139 | Clinical | 1 | chr13 |
| Desjardins | VNI | SRS404518 | Bt134 | Clinical | 1 | chr12 |
| Desjardins | VNI | SRS404772 | Bt141 | Clinical | 1 | chr09 |
| Desjardins | VNI | SRS404793 | MW-RSA6134 | Clinical | 1 | chr12 |
| Desjardins | VNI | SRS405145 | Bt117 | Clinical | 1 | chr09 |
| Desjardins | VNI | SRS409064 | MW-RSA1955 | Clinical | 1 | chr12 |
| Desjardins | VNI | SRS417642 | In2632 | Clinical | 1 | chr12 |
| Desjardins | VNI | SRS520178 | MW-RSA3834 | Clinical | 1 | chr12 |
| Desjardins | VNI | SRS885916 | PMHc1031A.ENR.INI.LP | Clinical | 1 | chr12 |
| Desjardins | VNII | SRS417571 | 8-1 | Clinical | 1 | chr14 |
| Desjardins | VNII | SRS520182 | WM626 | Clinical | 1 | chr14 |
Number of samples with duplications in each group of dataset-lineage-chromosome
dup_dataset_lineage_chromosome <- duplications_full %>%
group_by(dataset,lineage, chromosome) %>%
summarise(n_samples = n_distinct(sample),
samples_in_dataset_lineage = first(samples_in_dataset_lineage))%>%
mutate(percent_samples = round((n_samples / samples_in_dataset_lineage) * 100, 1))%>%
select(dataset,lineage, chromosome, n_samples, samples_in_dataset_lineage, percent_samples)%>%
arrange(chromosome, desc(lineage), desc(n_samples))
dup_dataset_lineage_chromosome| dataset | lineage | chromosome | n_samples | samples_in_dataset_lineage | percent_samples |
|---|---|---|---|---|---|
| Ashton | VNI | chr01 | 2 | 668 | 0.3 |
| Desjardins | VNI | chr04 | 1 | 185 | 0.5 |
| Ashton | VNI | chr06 | 4 | 668 | 0.6 |
| Ashton | VNI | chr09 | 3 | 668 | 0.4 |
| Desjardins | VNI | chr09 | 2 | 185 | 1.1 |
| Desjardins | VNBII | chr09 | 1 | 64 | 1.6 |
| Desjardins | VNII | chr12 | 1 | 16 | 6.2 |
| Ashton | VNI | chr12 | 9 | 668 | 1.3 |
| Desjardins | VNI | chr12 | 6 | 185 | 3.2 |
| Desjardins | VNBII | chr12 | 2 | 64 | 3.1 |
| Ashton | VNI | chr13 | 8 | 668 | 1.2 |
| Desjardins | VNI | chr13 | 2 | 185 | 1.1 |
| Desjardins | VNBI | chr13 | 1 | 122 | 0.8 |
| Desjardins | VNII | chr14 | 3 | 16 | 18.8 |
| Desjardins | VNI | chr14 | 1 | 185 | 0.5 |
| Desjardins | VNBI | chr14 | 1 | 122 | 0.8 |
Number of samples with duplications in each group of lineage-chromosome
dup_lineage_chromosome <- duplications_full%>%
group_by(lineage, chromosome) %>%
summarise(n_samples = n_distinct(sample),
samples_in_lineage = first(samples_in_lineage))%>%
mutate(percent_samples = round((n_samples / samples_in_lineage) * 100, 1))%>%
select(lineage, chromosome, n_samples, samples_in_lineage,percent_samples)%>%
arrange(chromosome, desc(lineage), desc(n_samples))
dup_lineage_chromosome | lineage | chromosome | n_samples | samples_in_lineage | percent_samples |
|---|---|---|---|---|
| VNI | chr01 | 2 | 853 | 0.2 |
| VNI | chr04 | 1 | 853 | 0.1 |
| VNI | chr06 | 4 | 853 | 0.5 |
| VNI | chr09 | 5 | 853 | 0.6 |
| VNBII | chr09 | 1 | 64 | 1.6 |
| VNII | chr12 | 1 | 16 | 6.2 |
| VNI | chr12 | 15 | 853 | 1.8 |
| VNBII | chr12 | 2 | 64 | 3.1 |
| VNI | chr13 | 10 | 853 | 1.2 |
| VNBI | chr13 | 1 | 122 | 0.8 |
| VNII | chr14 | 3 | 16 | 18.8 |
| VNI | chr14 | 1 | 853 | 0.1 |
| VNBI | chr14 | 1 | 122 | 0.8 |
Number of samples with duplications in each group of lineage-dataset
dup_lineage_dataset <- duplications_full%>%
group_by(dataset,lineage) %>%
summarise(n_samples = n_distinct(sample), samples_in_dataset_lineage = first(samples_in_dataset_lineage))%>%
mutate(percent_samples = round((n_samples / samples_in_dataset_lineage) * 100, 1))%>%
select(lineage, n_samples, samples_in_dataset_lineage, percent_samples)%>%
arrange(desc(lineage), desc(n_samples))
dup_lineage_dataset| dataset | lineage | n_samples | samples_in_dataset_lineage | percent_samples |
|---|---|---|---|---|
| Desjardins | VNII | 3 | 16 | 18.8 |
| Ashton | VNI | 26 | 668 | 3.9 |
| Desjardins | VNI | 11 | 185 | 5.9 |
| Desjardins | VNBII | 3 | 64 | 4.7 |
| Desjardins | VNBI | 2 | 122 | 1.6 |
Number of samples with duplications in each chromosome
dup_chromosome <- duplications_full %>%
group_by(chromosome) %>%
summarise(n_samples = n_distinct(sample), total_samples = first(total_samples))%>%
mutate(percent_samples = round((n_samples / total_samples) * 100, 1))%>%
select(chromosome, n_samples,total_samples, percent_samples)%>%
arrange(chromosome, desc(n_samples))
dup_chromosome | chromosome | n_samples | total_samples | percent_samples |
|---|---|---|---|
| chr01 | 2 | 1055 | 0.2 |
| chr04 | 1 | 1055 | 0.1 |
| chr06 | 4 | 1055 | 0.4 |
| chr09 | 6 | 1055 | 0.6 |
| chr12 | 18 | 1055 | 1.7 |
| chr13 | 11 | 1055 | 1.0 |
| chr14 | 5 | 1055 | 0.5 |